In [ ]:
%matplotlib notebook
import matplotlib.pyplot as plt
import numpy as np
Unpack the data. The `tar` command below only works on Linux and (probably) macOS; on Windows, unpack the archive with 7-Zip instead.
In [ ]:
#! tar -xf data/aclImdb.tar.bz2 --directory data
In [ ]:
from sklearn.datasets import load_files

# Read the training split from disk; the returned bunch exposes the documents
# as .data and their labels as .target.
reviews_train = load_files("data/aclImdb/train/")
text_train = reviews_train.data
y_train = reviews_train.target
In [ ]:
# Report how many training documents were loaded, then how many documents
# carry each label value (np.bincount counts occurrences of each integer).
print("Number of documents in training data: %d" % len(text_train))
print(np.bincount(y_train))
In [ ]:
# Read the test split the same way as the training split, then report its
# size and the number of documents per label value.
reviews_test = load_files("data/aclImdb/test/")
text_test = reviews_test.data
y_test = reviews_test.target
print("Number of documents in test data: %d" % len(text_test))
print(np.bincount(y_test))
Subsample the data (keep every 10th document) so that the following cells run fast enough for interactive use.
In [ ]:
# Keep every 10th document of each split.
# The slices are taken from the loaded bunches (reviews_train / reviews_test)
# rather than from text_train / text_test themselves, so this cell is
# idempotent: re-running it gives the same subsample instead of shrinking the
# already-subsampled data by another factor of 10.
text_train, y_train = reviews_train.data[::10], reviews_train.target[::10]
text_test, y_test = reviews_test.data[::10], reviews_test.target[::10]
In [ ]:
from IPython.display import HTML
In [ ]:
# Show the first training document exactly as it was loaded (no decoding).
print(text_train[0])
In [ ]:
# Decode the first document from UTF-8 and hand it to IPython's HTML display
# object, so any markup inside the text is rendered instead of shown literally.
HTML(text_train[0].decode("utf-8"))
In [ ]:
# Label value of the first training document.
print(y_train[0])
In [ ]:
from sklearn.feature_extraction.text import CountVectorizer

# Build a vectorizer with default settings and learn its vocabulary from the
# training documents (fit returns the vectorizer itself), then display how
# many distinct vocabulary entries were found.
cv = CountVectorizer().fit(text_train)
len(cv.vocabulary_)
In [ ]:
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2 in favour
# of get_feature_names_out(); use the new method when it exists and fall back
# to the old one on older releases.
get_names = getattr(cv, "get_feature_names_out", None) or cv.get_feature_names
# Fetch the names once and convert to a plain list so the printed output keeps
# the list formatting regardless of which method supplied the names.
feature_names = list(get_names())
# First 50 vocabulary entries, then a small window from further into the list.
print(feature_names[:50])
print(feature_names[5000:5020])
In [ ]:
# Encode the training documents with the fitted vectorizer.
X_train = cv.transform(text_train)
# Bare name as the last expression: the notebook displays its repr.
X_train
In [ ]:
# Inspect a single training document (index 739) as an example.
print(text_train[739])
In [ ]:
# Second array returned by nonzero() for row 739 of X_train.
# NOTE(review): assumes X_train is a sparse matrix whose nonzero() returns
# (row_indices, column_indices), so [1] yields the indices of the vocabulary
# entries that occur in document 739 - confirm against cv.transform's output.
X_train[739].nonzero()[1]
In [ ]:
# Encode the test documents with the vectorizer that was fitted on the
# training documents (the vectorizer is not re-fitted here).
X_test = cv.transform(text_test)
In [ ]:
from sklearn.svm import LinearSVC
# Create a linear SVM with default settings.
svm = LinearSVC()
# Train on the encoded training documents; as the cell's last expression, the
# value returned by fit(...) is displayed by the notebook.
svm.fit(X_train, y_train)
In [ ]:
# Score of the fitted model on the data it was trained on.
svm.score(X_train, y_train)
In [ ]:
# Score of the fitted model on the held-out test data.
svm.score(X_test, y_test)
In [ ]:
def visualize_coefficients(classifier, feature_names, n_top_features=25):
    """Plot the smallest and largest coefficients of a fitted linear model.

    A new 15x5 inch figure is created with one bar per selected coefficient:
    first the ``n_top_features`` smallest coefficients, then the
    ``n_top_features`` largest, each in ascending order. Bars for negative
    values are red, all others blue, and each bar is labelled with the
    corresponding entry of ``feature_names``.

    Parameters
    ----------
    classifier : object with a ``coef_`` array attribute
        The coefficients are flattened with ``ravel()`` before sorting.
    feature_names : sequence
        Feature names, indexable by position in the flattened ``coef_``.
    n_top_features : int, default=25
        How many coefficients to take from each end of the sorted order.

    Returns
    -------
    None
        The function only draws onto a new matplotlib figure.
    """
    coef = classifier.coef_.ravel()
    feature_names = np.asarray(feature_names)

    # Sort once and take both ends: the most negative and the most positive.
    order = np.argsort(coef)
    negative_coefficients = order[:n_top_features]
    positive_coefficients = order[-n_top_features:]
    interesting_coefficients = np.hstack([negative_coefficients, positive_coefficients])

    # plot them
    plt.figure(figsize=(15, 5))
    colors = ["red" if c < 0 else "blue" for c in coef[interesting_coefficients]]
    bar_positions = np.arange(len(interesting_coefficients))
    plt.bar(bar_positions, coef[interesting_coefficients], color=colors)
    # Leave room below the axes for the rotated tick labels.
    plt.subplots_adjust(bottom=0.3)
    # Put each tick at its own bar's position. Bars are centre-aligned by
    # default, so ticks shifted by one would label the neighbouring bar.
    plt.xticks(bar_positions, feature_names[interesting_coefficients], rotation=60, ha="right")
In [ ]:
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2 in favour
# of get_feature_names_out(); use the new method when it exists and fall back
# to the old one on older releases.
get_names = getattr(cv, "get_feature_names_out", None) or cv.get_feature_names
# Plot the strongest negative and positive coefficients of the fitted SVM,
# labelled with the vectorizer's vocabulary entries.
visualize_coefficients(svm, get_names())
In [ ]:
# Replace the vectorizer with one configured with ngram_range=(1, 2), learn
# its vocabulary from the training documents (fit returns the vectorizer
# itself), and display the new vocabulary size.
cv = CountVectorizer(ngram_range=(1, 2)).fit(text_train)
len(cv.vocabulary_)
In [ ]:
# Re-encode both splits with the current vectorizer so the feature matrices
# match its vocabulary.
X_train = cv.transform(text_train)
X_test = cv.transform(text_test)
In [ ]:
# Start from a fresh default linear SVM so nothing carries over from the
# previously fitted model.
svm = LinearSVC()
# Train on the re-encoded training documents; as the cell's last expression,
# the value returned by fit(...) is displayed by the notebook.
svm.fit(X_train, y_train)
In [ ]:
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2 in favour
# of get_feature_names_out(); use the new method when it exists and fall back
# to the old one on older releases.
get_names = getattr(cv, "get_feature_names_out", None) or cv.get_feature_names
# Plot the strongest negative and positive coefficients of the refitted SVM,
# labelled with the current vectorizer's vocabulary entries.
visualize_coefficients(svm, get_names())
Grid search the `C` parameter of the LinearSVC using a pipeline.
Vary the `ngram_range` of the CountVectorizer and visualize how the coefficients change.
In [ ]:
# %load solutions/text_pipeline.py